import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np
from scipy import stats
import sys
sys.path.append(sys.argv[1])

import pandas as pd   
import pickle

   
# Read the customer table from the directory given as the first CLI argument.
data_dir = sys.argv[1]
credit_customers = pd.read_csv(os.path.join(data_dir, 'credit_customers.csv'))

# Summary statistics for the `age` column.
age_column = credit_customers['age']
age_mean = age_column.mean()
age_median = age_column.median()
age_mode = age_column.mode()[0]  # mode() yields a Series; keep its first entry
age_range = age_column.max() - age_column.min()

print(f"Age - Mean: {age_mean}, Median: {age_median}, Mode: {age_mode}, Range: {age_range}")
# pickle.dump(age_mean, open("./ref_result/age_mean.pkl","wb"))
# pickle.dump(age_median, open("./ref_result/age_median.pkl","wb"))
# pickle.dump(age_mode, open("./ref_result/age_mode.pkl","wb"))
# pickle.dump(age_range, open("./ref_result/age_range.pkl","wb"))

  

import pandas as pd   
import matplotlib.pyplot as plt   
import seaborn as sns

# Load the dataset   
  
# Histogram (with KDE overlay) of customer age, written to ref_result/histplot.png.
# Create the output directory first so savefig does not fail when it is missing.
os.makedirs('ref_result', exist_ok=True)
plt.figure(figsize=(10, 6))
sns.histplot(credit_customers['age'], kde=True, bins=20)
plt.title("Age Distribution")
plt.xlabel("Age")
plt.ylabel("Frequency")
plt.savefig('ref_result/histplot.png')
# plt.show()  # enable for interactive viewing; must run before plt.close()
# pyplot keeps every figure alive until it is closed, so release it explicitly.
plt.close()


import pandas as pd   
import pickle

# Load the dataset   
  
# Number of customers per employment category (value_counts sorts by frequency).
employment_counts = credit_customers['employment'].value_counts()
print("\nEmployment Status Distribution:", employment_counts, sep="\n")
# pickle.dump(employment_counts,open("./ref_result/employment_counts.pkl","wb"))

import pandas as pd  
import matplotlib.pyplot as plt  
import seaborn as sns  
   
# Load the dataset   
   
# Bar chart of employment status, written to ref_result/countplot.png.
# Create the output directory first so savefig does not fail when it is missing.
os.makedirs('ref_result', exist_ok=True)
# Order the bars from the most to the least frequent category.
employment_order = credit_customers['employment'].value_counts().index
plt.figure(figsize=(10, 6))
sns.countplot(x='employment', data=credit_customers, order=employment_order)
plt.title("Employment Status Distribution")
plt.xlabel("Employment Status")
plt.ylabel("Frequency")
plt.savefig('ref_result/countplot.png')
# plt.show()  # enable for interactive viewing; must run before plt.close()
# pyplot keeps every figure alive until it is closed, so release it explicitly.
plt.close()


import pandas as pd   
import pickle

# Load the dataset   
  

# Number of customers per credit-history category (value_counts sorts by frequency).
credit_history_counts = credit_customers['credit_history'].value_counts()
print("\nCredit History Distribution:", credit_history_counts, sep="\n")
# pickle.dump(credit_history_counts,open("./ref_result/credit_history_counts.pkl","wb"))

import pandas as pd   
import matplotlib.pyplot as plt  
import seaborn as sns  

   
# Load the dataset   

  
# Bar chart of credit history, written to ref_result/countplot_2.png.
# Create the output directory first so savefig does not fail when it is missing.
os.makedirs('ref_result', exist_ok=True)
# Order the bars from the most to the least frequent category.
credit_history_order = credit_customers['credit_history'].value_counts().index
plt.figure(figsize=(10, 6))
sns.countplot(x='credit_history', data=credit_customers, order=credit_history_order)
plt.title("Credit History Distribution")
plt.xlabel("Credit History")
plt.ylabel("Frequency")
plt.xticks(rotation=45)  # tilt the tick labels by 45 degrees
plt.savefig('ref_result/countplot_2.png')
# plt.show()  # enable for interactive viewing; must run before plt.close()
# pyplot keeps every figure alive until it is closed, so release it explicitly.
plt.close()


import pandas as pd  
import scipy.stats as stats  
import pickle 
  
# Load the dataset  
  
# Flag customers aged 18 through 35 (both bounds inclusive) as "young".
credit_customers['young_customer'] = credit_customers['age'].between(18, 35)

print("credit_young_customer", credit_customers['young_customer'], sep="\n")
# pickle.dump(credit_customers['young_customer'],open("./ref_result/credit_young_customer.pkl","wb"))

import pandas as pd  
import pickle
  
# Load the dataset  
  
# Boolean mask: True where the credit history is one of the "paid" categories.
good_history_labels = ['existing paid', 'no credits/all paid', 'all paid']
good_credit_history = credit_customers['credit_history'].isin(good_history_labels)

print(good_credit_history)
# pickle.dump(good_credit_history,open("./ref_result/good_credit_history.pkl","wb"))

import pandas as pd  
import pickle
  
# Load the dataset  
  
# Split credit amounts into four groups by age band and credit-history quality.
is_young = credit_customers['young_customer']
amounts = credit_customers['credit_amount']

group1 = amounts[is_young & good_credit_history]    # young, good history
group2 = amounts[~is_young & good_credit_history]   # not young, good history
group3 = amounts[is_young & ~good_credit_history]   # young, other history
group4 = amounts[~is_young & ~good_credit_history]  # not young, other history
  
 
# pickle.dump(group1,open("./ref_result/group1.pkl","wb"))
# pickle.dump(group2,open("./ref_result/group2.pkl","wb"))
# pickle.dump(group3,open("./ref_result/group3.pkl","wb"))
# pickle.dump(group4,open("./ref_result/group4.pkl","wb"))

import pandas as pd 
import scipy.stats as stats  
import pickle

# Load the dataset  

# One-way ANOVA on credit amount across the four age / credit-history groups.
anova_samples = (group1, group2, group3, group4)
f_statistic, p_value = stats.f_oneway(*anova_samples)

print(f"F-statistic: {f_statistic}, P-value: {p_value}")
# pickle.dump(f_statistic,open("./ref_result/f_statistic.pkl","wb"))
# pickle.dump(p_value,open("./ref_result/p_value.pkl","wb"))

import pandas as pd  
from sklearn.preprocessing import LabelEncoder   
import pickle 
  
# Load the dataset  
  
# Replace every object-dtype column with integer codes from a LabelEncoder.
# The single encoder is refit per column, so afterwards `le` only remembers
# the classes of the last column it processed.
le = LabelEncoder()
categorical_columns = credit_customers.select_dtypes(include=['object']).columns
encoded_columns = {
    name: le.fit_transform(credit_customers[name]) for name in categorical_columns
}
for name, codes in encoded_columns.items():
    credit_customers[name] = codes

print("credit_customers", credit_customers, sep="\n")
# pickle.dump(credit_customers,open("./ref_result/credit_customers.pkl","wb"))

import pandas as pd   
import matplotlib.pyplot as plt   
import seaborn as sns

# Load the dataset   
  
# Annotated heatmap of pairwise column correlations, written to
# ref_result/Correlation_Matrix.png.
# Create the output directory first so savefig does not fail when it is missing.
os.makedirs('ref_result', exist_ok=True)
# NOTE(review): corr() is called on the whole frame, so this presumably relies on
# the categorical columns having been label-encoded earlier in the script.
corr_matrix = credit_customers.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Matrix")
plt.savefig('ref_result/Correlation_Matrix.png')
# plt.show()  # enable for interactive viewing; must run before plt.close()
# pyplot keeps every figure alive until it is closed, so release it explicitly.
plt.close()


import pandas as pd   
from sklearn.model_selection import train_test_split  
import pickle
from sklearn.linear_model import LogisticRegression

# Load the dataset   
  
# Separate the target column `class` from the predictor columns.
y = credit_customers['class']
X = credit_customers.drop(columns=['class'])

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit the classifier on the training portion (fit() returns the estimator itself).
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# pickle.dump(X_train,open("./ref_result/X_train.pkl","wb"))
# pickle.dump(X_test,open("./ref_result/X_test.pkl","wb"))
# pickle.dump(y_train,open("./ref_result/y_train.pkl","wb"))
# pickle.dump(y_test,open("./ref_result/y_test.pkl","wb"))
# pickle.dump(log_reg,open("./ref_result/log_reg.pkl","wb"))

import pandas as pd   
from sklearn.metrics import classification_report, confusion_matrix 
import pickle

# Load the dataset   
  
# Predict on the held-out rows, then print the classification report followed
# by the confusion matrix.
y_pred = log_reg.predict(X_test)

for metric in (classification_report, confusion_matrix):
    print(metric(y_test, y_pred))

# pickle.dump(classification_report(y_test, y_pred),open("./ref_result/classification_report.pkl","wb"))
# pickle.dump(confusion_matrix(y_test, y_pred),open("./ref_result/confusion_matrix.pkl","wb"))



import pandas as pd   
import pickle

# Load the dataset   
  
# Table of logistic-regression coefficients per feature, largest value first.
# The sort uses the signed coefficient, so strongly negative coefficients end
# up at the bottom of the table rather than near the top.
# NOTE(review): only the first row of coef_ is used — assumes a binary target; confirm.
feature_importances = pd.DataFrame(
    log_reg.coef_[0], index=X.columns, columns=['importance']
).sort_values('importance', ascending=False)
print("\nFeature Importances:")
print(feature_importances)

# Persist the table. Make sure the output directory exists, and use a context
# manager so the file handle is flushed and closed deterministically.
os.makedirs("./ref_result", exist_ok=True)
with open("./ref_result/feature_importances.pkl", "wb") as out_file:
    pickle.dump(feature_importances, out_file)
